In [102]:
import numpy as np
import pandas as pd
df=pd.read_csv(r'/Users/binodrai/Desktop/1718991964134-9073-customer_churn_data.csv')
In [103]:
df.head(5)
Out[103]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 CUST0000 Male 0 No Yes 23 No No phone service No No internet service ... No internet service No internet service No internet service No internet service Month-to-month Yes Bank transfer 49.85 1146.55 No
1 CUST0001 Female 0 Yes No 43 No No phone service DSL Yes ... Yes No Yes No Month-to-month No Mailed check 100.70 4330.10 Yes
2 CUST0002 Male 1 No No 51 Yes No DSL No ... Yes Yes No No One year No Electronic check 97.33 4963.83 Yes
3 CUST0003 Male 1 No No 72 Yes Yes DSL Yes ... Yes No No No Month-to-month No Credit card 101.38 7299.36 No
4 CUST0004 Male 1 No No 25 Yes Yes DSL No ... No Yes No Yes Month-to-month No Electronic check 52.22 1305.50 Yes

5 rows × 21 columns

In [104]:
df.tail()
Out[104]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
5875 CUST5875 Male 0 Yes Yes 71 Yes No DSL Yes ... No Yes No No Month-to-month Yes Mailed check 74.21 5268.91 No
5876 CUST5876 Male 0 No No 22 Yes Yes DSL No ... Yes Yes No Yes One year No Mailed check 65.43 1439.46 Yes
5877 CUST5877 Female 0 No No 68 No No phone service Fiber optic Yes ... Yes Yes Yes No One year Yes Bank transfer 59.78 4065.04 No
5878 CUST5878 Female 0 Yes Yes 14 No No phone service Fiber optic Yes ... Yes Yes Yes Yes Month-to-month Yes Mailed check 91.88 1286.32 No
5879 CUST5879 Female 1 Yes No 23 Yes No No No internet service ... No internet service No internet service No internet service No internet service Month-to-month No Mailed check 25.45 585.35 Yes

5 rows × 21 columns

In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5880 entries, 0 to 5879
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        5880 non-null   object 
 1   gender            5880 non-null   object 
 2   SeniorCitizen     5880 non-null   int64  
 3   Partner           5880 non-null   object 
 4   Dependents        5880 non-null   object 
 5   tenure            5880 non-null   int64  
 6   PhoneService      5880 non-null   object 
 7   MultipleLines     5880 non-null   object 
 8   InternetService   5880 non-null   object 
 9   OnlineSecurity    5880 non-null   object 
 10  OnlineBackup      5880 non-null   object 
 11  DeviceProtection  5880 non-null   object 
 12  TechSupport       5880 non-null   object 
 13  StreamingTV       5880 non-null   object 
 14  StreamingMovies   5880 non-null   object 
 15  Contract          5880 non-null   object 
 16  PaperlessBilling  5880 non-null   object 
 17  PaymentMethod     5880 non-null   object 
 18  MonthlyCharges    5880 non-null   float64
 19  TotalCharges      5880 non-null   float64
 20  Churn             5880 non-null   object 
dtypes: float64(2), int64(2), object(17)
memory usage: 964.8+ KB
In [8]:
df.describe()
Out[8]:
SeniorCitizen tenure MonthlyCharges TotalCharges
count 5880.000000 5880.000000 5880.000000 5880.000000
mean 0.500680 36.549150 70.157779 2566.813165
std 0.500042 20.909674 28.804615 1910.017743
min 0.000000 1.000000 20.000000 20.030000
25% 0.000000 18.000000 45.717500 1020.217500
50% 1.000000 37.000000 70.155000 2136.445000
75% 1.000000 55.000000 95.457500 3767.665000
max 1.000000 72.000000 119.990000 8589.600000
In [9]:
df.shape
Out[9]:
(5880, 21)
In [10]:
df.dtypes
Out[10]:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object
In [11]:
df.nunique()
Out[11]:
customerID          5880
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                72
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      4439
TotalCharges        5794
Churn                  2
dtype: int64
In [23]:
df['gender'].value_counts(ascending=True)
Out[23]:
gender
Female    2930
Male      2950
Name: count, dtype: int64
In [24]:
df.isnull().sum()  #detect missing value in dataset
Out[24]:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
In [25]:
df.nunique() #to know unique entries in categoraical column,understanding range of categories present
Out[25]:
customerID          5880
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                72
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      4439
TotalCharges        5794
Churn                  2
dtype: int64
In [27]:
df.columns
Out[27]:
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
In [36]:
correlation = df['MonthlyCharges'].corr(df['TotalCharges']).round(5)#only between numerical columns
print(f"Correlation between MonthlyCharges and TotalCharges: {correlation}")
Correlation between MonthlyCharges and TotalCharges: 0.5567
In [41]:
drop_column=df.drop('PaperlessBilling',axis=1)#axis 1 represent column have to specify drop in column or row
In [42]:
df.dtypes
Out[42]:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object
In [101]:
sort_Totalcharges=df.sort_values(by='TotalCharges',ascending=False)# can i show just total charges?
a=sort_Totalcharges['TotalCharges']
a.head()
Out[101]:
1765    8589.60
5082    8505.09
410     8498.16
1362    8478.00
5349    8449.20
Name: TotalCharges, dtype: float64
In [49]:
df.sample(2)
Out[49]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
2190 CUST2190 Female 0 Yes No 9 Yes No No No internet service ... No internet service No internet service No internet service No internet service One year Yes Mailed check 38.31 344.79 1
3221 CUST3221 Female 0 Yes Yes 65 No No phone service No No internet service ... No internet service No internet service No internet service No internet service Two year No Credit card 103.71 6741.15 1

2 rows × 21 columns

In [52]:
grouped=df.groupby('gender').agg({'MonthlyCharges':'mean','tenure':'count'})
print(grouped)
        MonthlyCharges  tenure
gender                        
Female       69.888218    2930
Male         70.425512    2950
In [53]:
df.fillna(value=0, inplace=True)  # Replace NaN with 0
print(df)
     customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0      CUST0000    Male              0      No        Yes      23   
1      CUST0001  Female              0     Yes         No      43   
2      CUST0002    Male              1      No         No      51   
3      CUST0003    Male              1      No         No      72   
4      CUST0004    Male              1      No         No      25   
...         ...     ...            ...     ...        ...     ...   
5875   CUST5875    Male              0     Yes        Yes      71   
5876   CUST5876    Male              0      No         No      22   
5877   CUST5877  Female              0      No         No      68   
5878   CUST5878  Female              0     Yes        Yes      14   
5879   CUST5879  Female              1     Yes         No      23   

     PhoneService     MultipleLines InternetService       OnlineSecurity  ...  \
0              No  No phone service              No  No internet service  ...   
1              No  No phone service             DSL                  Yes  ...   
2             Yes                No             DSL                   No  ...   
3             Yes               Yes             DSL                  Yes  ...   
4             Yes               Yes             DSL                   No  ...   
...           ...               ...             ...                  ...  ...   
5875          Yes                No             DSL                  Yes  ...   
5876          Yes               Yes             DSL                   No  ...   
5877           No  No phone service     Fiber optic                  Yes  ...   
5878           No  No phone service     Fiber optic                  Yes  ...   
5879          Yes                No              No  No internet service  ...   

         DeviceProtection          TechSupport          StreamingTV  \
0     No internet service  No internet service  No internet service   
1                     Yes                   No                  Yes   
2                     Yes                  Yes                   No   
3                     Yes                   No                   No   
4                      No                  Yes                   No   
...                   ...                  ...                  ...   
5875                   No                  Yes                   No   
5876                  Yes                  Yes                   No   
5877                  Yes                  Yes                  Yes   
5878                  Yes                  Yes                  Yes   
5879  No internet service  No internet service  No internet service   

          StreamingMovies        Contract PaperlessBilling     PaymentMethod  \
0     No internet service  Month-to-month              Yes     Bank transfer   
1                      No  Month-to-month               No      Mailed check   
2                      No        One year               No  Electronic check   
3                      No  Month-to-month               No       Credit card   
4                     Yes  Month-to-month               No  Electronic check   
...                   ...             ...              ...               ...   
5875                   No  Month-to-month              Yes      Mailed check   
5876                  Yes        One year               No      Mailed check   
5877                   No        One year              Yes     Bank transfer   
5878                  Yes  Month-to-month              Yes      Mailed check   
5879  No internet service  Month-to-month               No      Mailed check   

     MonthlyCharges  TotalCharges  Churn  
0             49.85       1146.55      0  
1            100.70       4330.10      1  
2             97.33       4963.83      1  
3            101.38       7299.36      0  
4             52.22       1305.50      1  
...             ...           ...    ...  
5875          74.21       5268.91      0  
5876          65.43       1439.46      1  
5877          59.78       4065.04      0  
5878          91.88       1286.32      0  
5879          25.45        585.35      1  

[5880 rows x 21 columns]
In [54]:
df.isna()
Out[54]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 False False False False False False False False False False ... False False False False False False False False False False
1 False False False False False False False False False False ... False False False False False False False False False False
2 False False False False False False False False False False ... False False False False False False False False False False
3 False False False False False False False False False False ... False False False False False False False False False False
4 False False False False False False False False False False ... False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5875 False False False False False False False False False False ... False False False False False False False False False False
5876 False False False False False False False False False False ... False False False False False False False False False False
5877 False False False False False False False False False False ... False False False False False False False False False False
5878 False False False False False False False False False False ... False False False False False False False False False False
5879 False False False False False False False False False False ... False False False False False False False False False False

5880 rows × 21 columns

In [56]:
df.fillna(value=0, inplace=True)  # Replace NaN with 0
print(df) #there no nan value present here
     customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0      CUST0000    Male              0      No        Yes      23   
1      CUST0001  Female              0     Yes         No      43   
2      CUST0002    Male              1      No         No      51   
3      CUST0003    Male              1      No         No      72   
4      CUST0004    Male              1      No         No      25   
...         ...     ...            ...     ...        ...     ...   
5875   CUST5875    Male              0     Yes        Yes      71   
5876   CUST5876    Male              0      No         No      22   
5877   CUST5877  Female              0      No         No      68   
5878   CUST5878  Female              0     Yes        Yes      14   
5879   CUST5879  Female              1     Yes         No      23   

     PhoneService     MultipleLines InternetService       OnlineSecurity  ...  \
0              No  No phone service              No  No internet service  ...   
1              No  No phone service             DSL                  Yes  ...   
2             Yes                No             DSL                   No  ...   
3             Yes               Yes             DSL                  Yes  ...   
4             Yes               Yes             DSL                   No  ...   
...           ...               ...             ...                  ...  ...   
5875          Yes                No             DSL                  Yes  ...   
5876          Yes               Yes             DSL                   No  ...   
5877           No  No phone service     Fiber optic                  Yes  ...   
5878           No  No phone service     Fiber optic                  Yes  ...   
5879          Yes                No              No  No internet service  ...   

         DeviceProtection          TechSupport          StreamingTV  \
0     No internet service  No internet service  No internet service   
1                     Yes                   No                  Yes   
2                     Yes                  Yes                   No   
3                     Yes                   No                   No   
4                      No                  Yes                   No   
...                   ...                  ...                  ...   
5875                   No                  Yes                   No   
5876                  Yes                  Yes                   No   
5877                  Yes                  Yes                  Yes   
5878                  Yes                  Yes                  Yes   
5879  No internet service  No internet service  No internet service   

          StreamingMovies        Contract PaperlessBilling     PaymentMethod  \
0     No internet service  Month-to-month              Yes     Bank transfer   
1                      No  Month-to-month               No      Mailed check   
2                      No        One year               No  Electronic check   
3                      No  Month-to-month               No       Credit card   
4                     Yes  Month-to-month               No  Electronic check   
...                   ...             ...              ...               ...   
5875                   No  Month-to-month              Yes      Mailed check   
5876                  Yes        One year               No      Mailed check   
5877                   No        One year              Yes     Bank transfer   
5878                  Yes  Month-to-month              Yes      Mailed check   
5879  No internet service  Month-to-month               No      Mailed check   

     MonthlyCharges  TotalCharges  Churn  
0             49.85       1146.55      0  
1            100.70       4330.10      1  
2             97.33       4963.83      1  
3            101.38       7299.36      0  
4             52.22       1305.50      1  
...             ...           ...    ...  
5875          74.21       5268.91      0  
5876          65.43       1439.46      1  
5877          59.78       4065.04      0  
5878          91.88       1286.32      0  
5879          25.45        585.35      1  

[5880 rows x 21 columns]
In [57]:
df['Partner'].replace({'Yes': 1, 'No': 0}, inplace=True)  # Replace 'Yes' with 1 and 'No' with 0 in 'Partner' column
print(df['Partner'])
0       0
1       1
2       0
3       0
4       0
       ..
5875    1
5876    0
5877    0
5878    1
5879    1
Name: Partner, Length: 5880, dtype: int64
In [154]:
df.describe()
Out[154]:
SeniorCitizen tenure MonthlyCharges TotalCharges
count 5880.000000 5880.000000 5880.000000 5880.000000
mean 0.500680 36.549150 70.157779 2565.315153
std 0.500042 20.909674 28.804615 1905.670601
min 0.000000 1.000000 20.000000 20.030000
25% 0.000000 18.000000 45.717500 1020.217500
50% 1.000000 37.000000 70.155000 2136.445000
75% 1.000000 55.000000 95.457500 3767.665000
max 1.000000 72.000000 119.990000 7888.836250
In [155]:
df.index
Out[155]:
RangeIndex(start=0, stop=5880, step=1)
In [153]:
import plotly.graph_objects as go

# Define index for x-axis
x = df.index

# Create line plots for each column
fig = go.Figure()

# Plot SeniorCitizen
fig.add_trace(go.Scatter(x=x, y=df['SeniorCitizen'], mode='lines+markers', name='SeniorCitizen'))

# Plot Tenure
fig.add_trace(go.Scatter(x=x, y=df['tenure'], mode='lines+markers', name='Tenure'))

# Plot MonthlyCharges
fig.add_trace(go.Scatter(x=x, y=df['MonthlyCharges'], mode='lines+markers', name='MonthlyCharges'))

# Plot TotalCharges
fig.add_trace(go.Scatter(x=x, y=df['TotalCharges'], mode='lines+markers', name='TotalCharges'))

# Customize plot
fig.update_layout(title='DataFrame Line Plots', xaxis_title='Index', yaxis_title='Values', legend_title='Columns')

# Show plot
fig.show()
In [159]:
import pandas as pd
import matplotlib.pyplot as plt
# Cross-tabulation of Gender vs Review
gender_review_ct = pd.crosstab(df['gender'], df['InternetService'])
print(gender_review_ct)
gender_review_ct.plot(kind='bar', stacked=True, title='Usage of Internet Services By Gender')
plt.xlabel('gender')
plt.ylabel('Count')
plt.show()
InternetService  DSL  Fiber optic    No
gender                                 
Female           938          957  1035
Male             998          958   994
No description has been provided for this image
In [151]:
df.rename(columns={'gender':'Sex'})
Out[151]:
customerID Sex SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 CUST0000 Male 0 No Yes 23 No No phone service No No internet service ... No internet service No internet service No internet service No internet service Month-to-month Yes Bank transfer 49.85 1146.55 No
1 CUST0001 Female 0 Yes No 43 No No phone service DSL Yes ... Yes No Yes No Month-to-month No Mailed check 100.70 4330.10 Yes
2 CUST0002 Male 1 No No 51 Yes No DSL No ... Yes Yes No No One year No Electronic check 97.33 4963.83 Yes
3 CUST0003 Male 1 No No 72 Yes Yes DSL Yes ... Yes No No No Month-to-month No Credit card 101.38 7299.36 No
4 CUST0004 Male 1 No No 25 Yes Yes DSL No ... No Yes No Yes Month-to-month No Electronic check 52.22 1305.50 Yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5875 CUST5875 Male 0 Yes Yes 71 Yes No DSL Yes ... No Yes No No Month-to-month Yes Mailed check 74.21 5268.91 No
5876 CUST5876 Male 0 No No 22 Yes Yes DSL No ... Yes Yes No Yes One year No Mailed check 65.43 1439.46 Yes
5877 CUST5877 Female 0 No No 68 No No phone service Fiber optic Yes ... Yes Yes Yes No One year Yes Bank transfer 59.78 4065.04 No
5878 CUST5878 Female 0 Yes Yes 14 No No phone service Fiber optic Yes ... Yes Yes Yes Yes Month-to-month Yes Mailed check 91.88 1286.32 No
5879 CUST5879 Female 1 Yes No 23 Yes No No No internet service ... No internet service No internet service No internet service No internet service Month-to-month No Mailed check 25.45 585.35 Yes

5880 rows × 21 columns

In [105]:
df_2= df.loc[:,['MonthlyCharges','TotalCharges']]
df_2.head()
Out[105]:
MonthlyCharges TotalCharges
0 49.85 1146.55
1 100.70 4330.10
2 97.33 4963.83
3 101.38 7299.36
4 52.22 1305.50
In [106]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_2)
df_scaled = scaler.transform(df_2)
df_scaled
Out[106]:
array([[-0.7050782 , -0.74364961],
       [ 1.06041406,  0.92325676],
       [ 0.94340897,  1.25507771],
       ...,
       [-0.36031246,  0.78447138],
       [ 0.75418708, -0.67046606],
       [-1.55223673, -1.03749386]])
In [63]:
df_scaled = pd.DataFrame(df_scaled)
In [66]:
df_scaled.describe().round(0)
Out[66]:
0 1
count 5880.0 5880.0
mean 0.0 0.0
std 1.0 1.0
min -2.0 -1.0
25% -1.0 -1.0
50% -0.0 -0.0
75% 1.0 1.0
max 2.0 3.0
In [109]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
minmax.fit(df_2)#also renaming data frame
df_minmax = minmax.transform(df_2)
df_minmax
Out[109]:
array([[0.29852985, 0.13145584],
       [0.80708071, 0.50295056],
       [0.77337734, 0.57690176],
       ...,
       [0.39783978, 0.47202018],
       [0.71887189, 0.14776587],
       [0.05450545, 0.0659683 ]])
In [110]:
df_minmax=pd.DataFrame(df_minmax)
df_minmax.describe()
Out[110]:
0 1
count 5880.000000 5880.000000
mean 0.501628 0.297189
std 0.288075 0.222884
min 0.000000 0.000000
25% 0.257201 0.116714
50% 0.501600 0.246969
75% 0.754650 0.437319
max 1.000000 1.000000
In [112]:
df.head()
Out[112]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 CUST0000 Male 0 No Yes 23 No No phone service No No internet service ... No internet service No internet service No internet service No internet service Month-to-month Yes Bank transfer 49.85 1146.55 No
1 CUST0001 Female 0 Yes No 43 No No phone service DSL Yes ... Yes No Yes No Month-to-month No Mailed check 100.70 4330.10 Yes
2 CUST0002 Male 1 No No 51 Yes No DSL No ... Yes Yes No No One year No Electronic check 97.33 4963.83 Yes
3 CUST0003 Male 1 No No 72 Yes Yes DSL Yes ... Yes No No No Month-to-month No Credit card 101.38 7299.36 No
4 CUST0004 Male 1 No No 25 Yes Yes DSL No ... No Yes No Yes Month-to-month No Electronic check 52.22 1305.50 Yes

5 rows × 21 columns

In [113]:
method_counts = df['PaymentMethod'].value_counts()
In [114]:
method_counts
Out[114]:
PaymentMethod
Credit card         1503
Electronic check    1488
Mailed check        1462
Bank transfer       1427
Name: count, dtype: int64
In [117]:
df.dtypes
Out[117]:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object
In [118]:
pd.get_dummies(df['PaymentMethod'],drop_first=True).astype(int).head(10)
Out[118]:
Credit card Electronic check Mailed check
0 0 0 0
1 0 0 1
2 0 1 0
3 1 0 0
4 0 1 0
5 1 0 0
6 0 0 0
7 0 0 1
8 0 0 1
9 0 1 0
In [119]:
pd.get_dummies(df['PaymentMethod'],).astype(int).head(10)
Out[119]:
Bank transfer Credit card Electronic check Mailed check
0 1 0 0 0
1 0 0 0 1
2 0 0 1 0
3 0 1 0 0
4 0 0 1 0
5 0 1 0 0
6 1 0 0 0
7 0 0 0 1
8 0 0 0 1
9 0 0 1 0
In [120]:
df
Out[120]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 CUST0000 Male 0 No Yes 23 No No phone service No No internet service ... No internet service No internet service No internet service No internet service Month-to-month Yes Bank transfer 49.85 1146.55 No
1 CUST0001 Female 0 Yes No 43 No No phone service DSL Yes ... Yes No Yes No Month-to-month No Mailed check 100.70 4330.10 Yes
2 CUST0002 Male 1 No No 51 Yes No DSL No ... Yes Yes No No One year No Electronic check 97.33 4963.83 Yes
3 CUST0003 Male 1 No No 72 Yes Yes DSL Yes ... Yes No No No Month-to-month No Credit card 101.38 7299.36 No
4 CUST0004 Male 1 No No 25 Yes Yes DSL No ... No Yes No Yes Month-to-month No Electronic check 52.22 1305.50 Yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5875 CUST5875 Male 0 Yes Yes 71 Yes No DSL Yes ... No Yes No No Month-to-month Yes Mailed check 74.21 5268.91 No
5876 CUST5876 Male 0 No No 22 Yes Yes DSL No ... Yes Yes No Yes One year No Mailed check 65.43 1439.46 Yes
5877 CUST5877 Female 0 No No 68 No No phone service Fiber optic Yes ... Yes Yes Yes No One year Yes Bank transfer 59.78 4065.04 No
5878 CUST5878 Female 0 Yes Yes 14 No No phone service Fiber optic Yes ... Yes Yes Yes Yes Month-to-month Yes Mailed check 91.88 1286.32 No
5879 CUST5879 Female 1 Yes No 23 Yes No No No internet service ... No internet service No internet service No internet service No internet service Month-to-month No Mailed check 25.45 585.35 Yes

5880 rows × 21 columns

In [124]:
df.describe().round(2)
Out[124]:
SeniorCitizen tenure MonthlyCharges TotalCharges
count 5880.0 5880.00 5880.00 5880.00
mean 0.5 36.55 70.16 2566.81
std 0.5 20.91 28.80 1910.02
min 0.0 1.00 20.00 20.03
25% 0.0 18.00 45.72 1020.22
50% 1.0 37.00 70.16 2136.44
75% 1.0 55.00 95.46 3767.66
max 1.0 72.00 119.99 8589.60
In [126]:
upper_limit = df['tenure'].mean() + 3*df['tenure'].std()
lower_limit = df['tenure'].mean() - 3*df['tenure'].std()
In [127]:
upper_limit
Out[127]:
99.27817099498411
In [128]:
lower_limit
Out[128]:
-26.179871675256223
In [129]:
Outlier_zscore = df[(df['tenure'] >=upper_limit)|(df['tenure'] <=lower_limit)]
Outlier_zscore
Out[129]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn

0 rows × 21 columns

In [130]:
upper_limit = df['TotalCharges'].mean() + 3*df['TotalCharges'].std()
lower_limit = df['TotalCharges'].mean() - 3*df['TotalCharges'].std()
In [131]:
upper_limit
Out[131]:
8296.866392560543
In [132]:
lower_limit
Out[132]:
-3163.2400626285716
In [136]:
Outlier_zscore = df[(df['TotalCharges'] >=upper_limit)|(df['TotalCharges'] <=lower_limit)]
Outlier_zscore
Out[136]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
410 CUST0410 Male 1 No Yes 72 No No phone service Fiber optic Yes ... Yes No No No Month-to-month Yes Credit card 118.03 8498.16 Yes
764 CUST0764 Male 1 Yes No 71 No No phone service No No internet service ... No internet service No internet service No internet service No internet service Month-to-month Yes Electronic check 118.58 8419.18 No
1006 CUST1006 Female 0 Yes No 72 No No phone service DSL No ... No Yes Yes Yes Two year Yes Mailed check 116.53 8390.16 No
1362 CUST1362 Male 0 No No 72 Yes Yes DSL No ... No Yes Yes No Two year No Credit card 117.75 8478.00 Yes
1765 CUST1765 Female 0 Yes No 72 Yes Yes Fiber optic Yes ... Yes Yes No Yes One year Yes Bank transfer 119.30 8589.60 No
5082 CUST5082 Male 0 No No 71 Yes No Fiber optic Yes ... No No No Yes One year Yes Bank transfer 119.79 8505.09 No
5349 CUST5349 Male 1 No Yes 72 Yes No Fiber optic No ... Yes No Yes Yes Month-to-month Yes Credit card 117.35 8449.20 No
5476 CUST5476 Male 1 Yes No 71 Yes Yes DSL No ... Yes No Yes No One year Yes Mailed check 118.74 8430.54 No

8 rows × 21 columns

In [138]:
trimmed_df = df[(df['TotalCharges'] <=upper_limit) & (df['TotalCharges'] >=lower_limit)] #trimming using Z-score
trimmed_df.shape
Out[138]:
(5872, 21)
In [139]:
df=pd.read_csv(r'/Users/binodrai/Desktop/1718991964134-9073-customer_churn_data.csv')
In [141]:
# IQR METHOD ::Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = df['TotalCharges'].quantile(0.25)
Q3 = df['TotalCharges'].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
In [142]:
lower_bound
Out[142]:
-3100.9537500000006
In [143]:
upper_bound
Out[143]:
7888.83625
In [145]:
###Outlier IQR score 
Outlier_zscore
Outlier_iqr = df[(df['TotalCharges'] >=upper_bound)|(df['TotalCharges'] <=lower_bound)]
Outlier_iqr
Out[145]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
83 CUST0083 Female 1 Yes No 71 Yes Yes Fiber optic Yes ... No Yes No No One year No Bank transfer 112.54 7990.34 Yes
410 CUST0410 Male 1 No Yes 72 No No phone service Fiber optic Yes ... Yes No No No Month-to-month Yes Credit card 118.03 8498.16 Yes
650 CUST0650 Male 0 Yes Yes 72 No No phone service Fiber optic No ... No Yes No Yes Month-to-month Yes Mailed check 111.87 8054.64 No
764 CUST0764 Male 1 Yes No 71 No No phone service No No internet service ... No internet service No internet service No internet service No internet service Month-to-month Yes Electronic check 118.58 8419.18 No
1006 CUST1006 Female 0 Yes No 72 No No phone service DSL No ... No Yes Yes Yes Two year Yes Mailed check 116.53 8390.16 No
1362 CUST1362 Male 0 No No 72 Yes Yes DSL No ... No Yes Yes No Two year No Credit card 117.75 8478.00 Yes
1598 CUST1598 Female 1 No Yes 68 No No phone service No No internet service ... No internet service No internet service No internet service No internet service Month-to-month No Electronic check 118.03 8026.04 No
1631 CUST1631 Male 1 No Yes 69 Yes Yes No No internet service ... No internet service No internet service No internet service No internet service Month-to-month Yes Bank transfer 115.61 7977.09 No
1765 CUST1765 Female 0 Yes No 72 Yes Yes Fiber optic Yes ... Yes Yes No Yes One year Yes Bank transfer 119.30 8589.60 No
1976 CUST1976 Male 0 No No 67 No No phone service No No internet service ... No internet service No internet service No internet service No internet service Two year Yes Mailed check 118.63 7948.21 No
2218 CUST2218 Female 1 No Yes 70 No No phone service No No internet service ... No internet service No internet service No internet service No internet service Month-to-month Yes Electronic check 117.34 8213.80 Yes
2225 CUST2225 Male 0 Yes No 69 Yes No No No internet service ... No internet service No internet service No internet service No internet service Two year Yes Credit card 117.39 8099.91 Yes
2314 CUST2314 Male 1 No Yes 72 Yes No Fiber optic No ... No No Yes No One year Yes Bank transfer 111.85 8053.20 Yes
2818 CUST2818 Male 1 Yes No 67 Yes Yes Fiber optic Yes ... Yes Yes Yes Yes Two year Yes Electronic check 118.89 7965.63 No
3060 CUST3060 Female 1 No No 70 No No phone service Fiber optic Yes ... No Yes No No Two year No Bank transfer 117.75 8242.50 No
3302 CUST3302 Male 1 Yes Yes 68 Yes No DSL Yes ... Yes Yes No No Two year No Credit card 118.25 8041.00 Yes
3414 CUST3414 Male 1 No No 71 No No phone service DSL No ... Yes No Yes Yes Month-to-month Yes Electronic check 116.80 8292.80 No
3473 CUST3473 Female 0 No No 68 Yes Yes DSL Yes ... No No No Yes Month-to-month Yes Bank transfer 118.49 8057.32 Yes
3707 CUST3707 Female 1 No Yes 69 No No phone service Fiber optic Yes ... No No Yes No One year No Mailed check 117.38 8099.22 Yes
3775 CUST3775 Female 1 Yes Yes 67 No No phone service DSL Yes ... Yes Yes Yes No Two year Yes Bank transfer 118.91 7966.97 Yes
3789 CUST3789 Male 1 Yes No 71 No No phone service DSL No ... No Yes No Yes Two year Yes Mailed check 112.14 7961.94 No
4094 CUST4094 Female 0 Yes Yes 69 Yes Yes No No internet service ... No internet service No internet service No internet service No internet service Two year No Mailed check 119.99 8279.31 Yes
4437 CUST4437 Female 1 Yes No 72 Yes No Fiber optic No ... Yes Yes No Yes Month-to-month Yes Bank transfer 112.80 8121.60 No
4638 CUST4638 Female 0 Yes No 69 Yes Yes DSL Yes ... No No Yes No Two year No Credit card 114.42 7894.98 Yes
4718 CUST4718 Female 0 No Yes 71 Yes Yes Fiber optic Yes ... Yes Yes No No Two year No Credit card 115.53 8202.63 No
4774 CUST4774 Male 0 No Yes 68 No No phone service Fiber optic No ... No No Yes Yes One year Yes Credit card 117.36 7980.48 Yes
4872 CUST4872 Male 1 Yes Yes 69 No No phone service DSL No ... No No No Yes Two year No Credit card 116.72 8053.68 No
5082 CUST5082 Male 0 No No 71 Yes No Fiber optic Yes ... No No No Yes One year Yes Bank transfer 119.79 8505.09 No
5257 CUST5257 Male 0 No Yes 68 No No phone service DSL No ... No Yes Yes Yes One year Yes Credit card 116.22 7902.96 No
5349 CUST5349 Male 1 No Yes 72 Yes No Fiber optic No ... Yes No Yes Yes Month-to-month Yes Credit card 117.35 8449.20 No
5476 CUST5476 Male 1 Yes No 71 Yes Yes DSL No ... Yes No Yes No One year Yes Mailed check 118.74 8430.54 No
5704 CUST5704 Female 1 No Yes 71 Yes No DSL Yes ... Yes Yes No No One year No Mailed check 113.59 8064.89 No

32 rows × 21 columns

In [146]:
###tRIMMING IQR score

# Using IQR for trimming, assuming we already have lower_bound and upper_bound calculated
trimmed_df = df[(df['TotalCharges'] <=upper_bound) & (df['TotalCharges'] >=lower_bound)]

trimmed_df.shape
Out[146]:
(5848, 21)
In [147]:
##Capping using IQR
df['TotalCharges'] = np.where(
    df['TotalCharges']>upper_bound,
    upper_bound,
    np.where(
        df['TotalCharges']<lower_bound,
        lower_bound,
        df['TotalCharges']
    )
)
In [148]:
df.describe().round(2)
Out[148]:
SeniorCitizen tenure MonthlyCharges TotalCharges
count 5880.0 5880.00 5880.00 5880.00
mean 0.5 36.55 70.16 2565.32
std 0.5 20.91 28.80 1905.67
min 0.0 1.00 20.00 20.03
25% 0.0 18.00 45.72 1020.22
50% 1.0 37.00 70.16 2136.44
75% 1.0 55.00 95.46 3767.66
max 1.0 72.00 119.99 7888.84
In [ ]: